# -*- coding: utf-8 -*-
"""
@Time    : 2025/2/26 18:49 
@Author  : ZhangShenao 
@File    : 3.去除停用词.py 
@Desc    : 去除停用词

停用词是文本中频繁出现但对分析意义不大的词,如is、and等
去除它们可以提高处理效率和分析效果,同时还可以使原始数据集变小
"""

import re

from nltk.corpus import stopwords


def remove_noise(raw: str) -> str:
    """Text cleaning: strip HTML tags and punctuation from ``raw``.

    Two passes are applied in order:

    1. Every ``<...>`` span is deleted (replaced with the empty string).
    2. Every character that is neither a word character (``\\w``: letters,
       digits, underscore) nor whitespace is deleted.

    Args:
        raw: The text to clean; may contain HTML markup.

    Returns:
        The cleaned text. Whitespace is left exactly as it was in ``raw``.
    """
    # Remove HTML tags. ``[^>]*`` (rather than ``.*?``) also matches newlines,
    # so a tag whose attributes wrap onto the next line is removed as a whole
    # instead of being left behind and mangled by the punctuation pass below.
    content = re.sub(r'<[^>]*>', '', raw)
    # Remove punctuation and other special characters.
    content = re.sub(r'[^\w\s]', '', content)
    return content


# Load the English stop-word list from nltk.corpus.stopwords. A set is used
# so that the membership test in the filter below is O(1) per token.
# NOTE: this needs the NLTK "stopwords" corpus to be available locally
# (typically installed once with nltk.download('stopwords')).
stop_words = set(stopwords.words('english'))
print(stop_words)

# Text cleaning
text = "<p>Hello, World! Here's a <a href='https://example.com'>link</a>.</p>"
clean_text = remove_noise(text)
print(clean_text)

# Tokenization: split on whitespace into words and lowercase them so they
# compare equal to the lowercase stop-word entries. Iterating over the string
# itself would yield single characters rather than words.
tokens_normalized = clean_text.lower().split()
print(tokens_normalized)

# Remove stop words
filtered_tokens = [word for word in tokens_normalized if word not in stop_words]
print(filtered_tokens)
